import pandas as pd
CSV : Comma Separated Values (.csv) File
A type of text File containing values delimited by or separated by comma
#Read the CSV into a DataFrame; the commented-out line shows a remote copy of the same file
#drinks = pd.read_csv('https://andybek.com/pandas-drinks')
drinks = pd.read_csv('drinks.csv')
drinks.head(3)
#Select only two of the columns while reading
pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'])
#Use "country" as the index of the resulting DataFrame instead of the default RangeIndex
pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'], index_col = 'country')
alcohol = pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'], index_col = 'country')
#The output is still a one-column DataFrame, not a Series
alcohol.head()
type(alcohol)
Note : "pandas.read_csv" method always returns a Data Frame
But there is this "squeeze" method
squeeze : bool, default False : If the parsed data only contains one column then squeeze that DataFrame and return a Series.
# FIX: read_csv's "squeeze" parameter was deprecated in pandas 1.4 and removed
# in pandas 2.0. The compatible equivalent is DataFrame.squeeze('columns'),
# which turns a single-column DataFrame into a Series.
alcohol = pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'], index_col = 'country').squeeze('columns')
alcohol.head()
alcohol = pd.read_csv('drinks.csv' , usecols = ['country']).squeeze('columns')
type(alcohol)
alcohol.head()
alcohol = pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'], index_col = 'country').squeeze('columns')
alcohol.head()
alcohol.shape
#size attribute gives us the length of the Series
#We are looking at a Series of length 193
alcohol.size
alcohol #You can see the "Length" here
Remember : A Series consists of a sequence of values and associated Labels.
#We can access the values :
#That gives us an Array of values
alcohol.values
#Lets look at the associated Index
alcohol.index
alcohol.values.size == alcohol.index.size
Observation
Note : Series are only one Dimensional
alcohol.shape
Observation
alcohol.shape[0]
alcohol.size
alcohol.size == alcohol.shape[0]
len(alcohol)
.size : number of elements in the series
.shape : tuple of the dimensions for a Series (1D) shape, i.e, length for series
len() : Python built-in function
Check if a Series contains a Sequence of Unique values
#Look at the alcohol Series
alcohol.head()
#is_unique is True only when every value in the Series occurs exactly once
alcohol.is_unique
#Checking uniqueness just for the first five rows returned by head()
alcohol.head().is_unique
alcohol.head(15)
#Same check over the first 15 rows — compare the two results to see where a repeat first appears
alcohol.head(15).is_unique
Observation
#For complete DataFrame
drinks.nunique()
alcohol.nunique()
Observation
#If you want to include the NA as well
alcohol.nunique(dropna = False)
#For complete DataFrame
drinks.nunique(dropna = False)
drinks.shape
#For complete DataFrame
#nunique count does not include NAN by default
drinks.nunique()
#we pass : dropna = False : so NANs are included if any
drinks.nunique(dropna = False)
# FIX: Series.is_monotonic was removed in pandas 2.0;
# is_monotonic_increasing is the exact equivalent (flat runs still count).
#Notice in the below series , the values are always increasing
pd.Series([1,2,3]).is_monotonic_increasing
#Below is a series which is increasing and Stagnating
pd.Series([1,2,3, 3 , 3 , 3 ]).is_monotonic_increasing
#Below is a series which is increasing and Stagnating and increasing
pd.Series([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 ]).is_monotonic_increasing
#Below is a series which is increasing and Stagnating and increasing and then we add a "7"
#Because at "7" the values decrease, the series is no longer Monotonic
pd.Series([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 , 7]).is_monotonic_increasing
Observation
#Below is a series which is increasing, stagnating, then increasing again
pd.Series([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 ]).is_monotonic_increasing
#The same series is clearly not monotonically decreasing
pd.Series([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 ]).is_monotonic_decreasing
#When we use reversed()
reversed_list = list(reversed([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 ]))
reversed_list
#Now we reverse the List and check for the Monotonicity
# FIX: is_monotonic was removed in pandas 2.0; is_monotonic_increasing is
# the equivalent spelling (False here, because the reversed series decreases).
pd.Series(reversed([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 ])).is_monotonic_increasing
pd.Series(reversed([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 ])).is_monotonic_decreasing
pd.Series(reversed([1,2,3, 3 , 3 , 3 , 48 , 79 , 100 ])).is_monotonic_increasing
#Count the values of a Series
alcohol.count()
#Count the values of a DataFrame
drinks.count()
#But the Size of the Series was 193
#Then why is the count() method showing just 162
#What accounts for this difference?
alcohol.size
count() method only looks at values that are not Null
Series.count() : Returns the number of non-NA / null observations in the Series
import numpy as np

# count() skips the NaN, so this three-element Series counts only two values.
demo = pd.Series([0.0, 1.0, np.nan])
demo.count()
Series.size : Gives a count of all the values
Series.count() : Gives the count of non-NA values
#Lets check if the alcohol Series has Na
alcohol.hasnans
drinks.head()
drinks['beer_servings'].hasnans
drinks['country'].hasnans
#Lets check if the alcohol Series has Na
alcohol.hasnans
Observation So we verified that our Series has NaNs
#The Total number of values in the Series is
alcohol.size
#The number of non-Null values in the Series is
alcohol.count()
What Countries in our "alcohol" Series do not have Wine Servings?
drinks.head()
alcohol
alcohol.isnull()
drinks = pd.read_csv('drinks.csv')
drinks.head(2)
# FIX: read_csv's "squeeze" parameter was removed in pandas 2.0;
# DataFrame.squeeze('columns') is the compatible replacement.
alcohol = pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'] , index_col = 'country').squeeze('columns')
alcohol.head(5)
alcohol.size
alcohol.count()
Nulls or NAs indicate the absence of a value
alcohol.isnull()
Observation
type(alcohol)
#Creating a Boolean Mask
#We get back another Series where only the countries with missing Wine Serving data are selected
alcohol[alcohol.isnull()]
#If we just want to look at the country names where the values are NaN (isnull == True)
#Since country is the index for the alcohol Series
#We get back a pandas Index object
alcohol[alcohol.isnull()].index
type(alcohol[alcohol.isnull()].index)
#Store it in a List
list(alcohol[alcohol.isnull()].index)
#One way to count the Nulls is to find the length of the Series that was produced above
#Simply wrap the above code in a len() function
#But this is not a pythonic way
len(list(alcohol[alcohol.isnull()].index))
#The idiomatic pandas way to get the count of NaNs: booleans sum as 0/1
alcohol.isnull().sum()
#Booleans are integers in Python : True == 1 and False == 0
sum([True , False , True]) #2 Trues and 1 False = 1 + 0 + 1
# FIX: the original bound the name "all", shadowing the built-in all();
# renamed to total_values so the built-in stays usable afterwards.
total_values = alcohol.size         #Total length, nulls included
nonnulls = alcohol.count()          #count() does not include nulls
nulls = alcohol.isnull().sum()      #number of null values
print(total_values)
print(nonnulls)
print(nulls)
#Sanity check: the parts add back up to the whole
total_values == nonnulls + nulls
.size : Number of elements in the series
.count() : Number of non null elements
.isna().sum() : number of null elements
import pandas as pd
import numpy as np
This is another approach to isolate NULLS. A more elegant way.
Vectorization is the process of going from applying computations to each element (element wise computations) to running them on the entire collection at once.
So, Vectorization refers to the concept of running operations on entire arrays.
When we use Pandas , we are also relying on Numpy , therefore vectorization behind the scenes contributes to the performance gains relative to an approach that is based on regular Python loops, which are most commonly sequential
#ufunc ---> Universal Function: a NumPy function applied elementwise to arrays/Series
#We will look at the np.isnan function
pd.Series(data=[True, False , None, 2] , dtype = float)
ser = pd.Series(data=[True, False , None, 2] , dtype = float)
#We pass this Series to the NumPy isnan() function
#We get back a Series of Booleans indicating whether each of the elements in the Series is null or not
np.isnan(ser)
np.isnan(ser).value_counts()
#Applying isnan() to the alcohol Series via callable indexing:
#pandas calls np.isnan(alcohol) and uses the boolean result as the mask
alcohol[np.isnan]
drinks.head(2)
np.isnan(drinks.wine_servings).value_counts()
drinks[np.isnan(drinks.wine_servings)]
drinks.columns
drinks[np.isnan(drinks.wine_servings)].shape
drinks[drinks['wine_servings'].isna()].shape
drinks[drinks['wine_servings'].isna()]
#To check the length of the resulting Series
alcohol[np.isnan].size
len(alcohol[np.isnan])
alcohol[np.isnan].shape
notnull() and notna()
But here , this time : "True" values are identified as notnull()
alcohol.notnull()
#We can use this resulting series as a Boolean mask in indexing for nulls only
alcohol.loc[alcohol.notnull()]
drinks[drinks['wine_servings'].notnull()]
#Get the sum of not null
alcohol.notnull().sum()
#Lets check if the count of not null + count of null == the size of the Series
alcohol.notnull().sum() + alcohol.isnull().sum() == alcohol.size
OR
print(drinks.wine_servings.notnull().sum())
print(drinks.wine_servings.isnull().sum())
print(drinks.wine_servings.size)
162 + 31
drinks.wine_servings.notnull().sum() + drinks.wine_servings.isnull().sum() == drinks.wine_servings.size
drinks.wine_servings.notna().sum() + drinks.wine_servings.isnull().sum() == drinks.wine_servings.size
The bool type inherits from (is a subclass of) int
bool --> int --> object
#Example :
True + 19
True + True - False + True * 3
1 + 1 - 0 + 1 * 3
type(True)
type(False)
wine_servings = alcohol[alcohol.notna()]
wine_servings.sum()
Hint : Apply a Boolean mask to identify such countries , then sum()
wine_servings.head(2)
wine_servings[wine_servings < 100].shape
wine_servings[wine_servings > 100].shape
wine_servings[wine_servings < 100].sum()
We want to completely exclude NAs from our series
#dropna() simply excludes NAs and returns a new Series
alcohol.dropna()
#One way re-assignment
#alcohol = alcohol.dropna()
#Another way : inplace parameter
#alcohol.dropna(inplace = True)
In the fillna() method we can specify what values we want to replace with and also pass the inplace parameter
#alcohol.fillna(100 , inplace = True)
alcohol.fillna(100 , inplace = False)
Note : Both .dropna() and .fillna() methods return a copy of the series unless we specify inplace = True
Metrics that allow us to characterise or Describe our Data
#We simply apply the sum() function to the Series itself
#Quick thing to note here is the NAs will be automatically excluded from this calculation
alcohol.sum()
Observation
But in itself , this number is not very meaningful. For example we don't know how this total consumption is distributed. How many countries are contributing to this Total and by how much , on Average ? what the Median is? and so on
#Average wine consumption across all the countries
#To calculate the Average we can make use of two functions
#We combine sum() with count()
#count() will give us the number of items which do not have null values
#By dividing sum by count we get the Average
alcohol.count()
alcohol.sum() / alcohol.count()
OR
alcohol.mean()
#Median is the middle most number
If we have a list containing Odd number of items like above , the median is simply the middle most element
But when we have an even list of numbers , we take the average of the two middlemost numbers
#median
alcohol.median()
Another way of looking at the Median is by ordering our Data and looking at the 50th Quantile . In Pandas we can do this by using the Quantile method.
alcohol.quantile(q = 0.5)
or
alcohol.quantile(0.5)
print("The MEAN in the alcohol Series is so much higher than the Median : " + str(alcohol.mean()) + " > " + str(alcohol.median()) )
This indicates that the Distribution of Wine_Servings is Right skewed or Positively skewed.
Positively skewed means , There are countries with large wine_servings that distort our mean but not much change our median.
Statistics is the study of how to collect, Organize , Analyze and interpret Numerical information and data.
Statistics is used in Healthcare and other disciplines too to help aid in decision making. Understanding statistics is necessary to understand certain processes in healthcare. (To figure out what to do, not how we do it.)
Descriptive Statistics involves methods of organizing, picturing and summarizing information from Samples and Populations.
Inferential Statistics involves methods of using information from a Sample to draw conclusions regarding the Population. Therefore Inferential Statistics can only be done ON a Sample. We are trying to infer from the Sample.
Variables are of two types : Qualitative and Quantitative
Quantitative variables : Interval and Ratio
(Individuals could be anything : hospitals , people , students , banks , Products)
You have to measure the variables about these individuals. So, Step 3 is to "Specify all of the Variables" you will need to measure about these individuals (of course they relate to the Hypothesis)
STep 4 is to determine whether you want to use the entire population in your study or just the Sample.
Now that you figured out your Hypothesis , you got your individuals , you got your variables. You figured out whether you are going to use a sample or a population. You selected your sampling method. Now the Step 5 is to "Address the ethical concerns before Data Collection" . Asking some sesitive Questions, you think about privacy. So, you have to sit down and think about these Ethical concerns.
Now its time for data Collection.
Remember: stating a Hypothesis is the most important part. without that , you will not even know what data to collect.
You either use Descriptive or Inferential Statistics to answer your Hypothesis.
Note any concerns about your Data Collection or Analysis. Make recommendations for future studies.
Example : STEPS 1 to 3
Because if you pick a Census , you will do a different kind of Analysis and if you pick a sample you will do a different kind of Analysis
Frequency Histogram is a Specific type of Bar chart made from data in a Frequency table.
Both Frequency Histograms and Relative Frequency Histograms are type of Bar Charts.
Histograms reveal the Distribution of Data.
This is how a Frequency Table looks like. Here we have added "RElative Frequency" column as well
Relative Frequency is nothing but the Proportion or percentage:
But ultimately you will get the same pattern in your Histogram , whether you use Frequency or Relative Frequency
Normal Distribution:
Uniform Distribution
Skewed Left Distribution
Skewed Right Distribution
Bimodal Distribution
Its like keep accumulating things :
Pareto chart is a different kind of Special bar graph
Remember : "FREQUENCY" is always about Quantitative Data
Left looks more centery and right is Skewed
A lot of Datasets have no mode. There is just no repeated value in them.
import numpy as np
import pandas as pd
test_mode = pd.Series( [1,2,3,4,5])
test_mode.mode()
It is also possible to have more than one MODE
test_mode = pd.Series( [1,2,2,3,4,5,3,4,5])
test_mode.mode()
What does the MODE tell you?
median_test = [42 , 33, 21 , 78 ,62]
sorted_median_test_ODD = sorted(median_test)
#This is the odd version
sorted_median_test_ODD
median_test = [12 , 7 , 3, 8 ,1 ,9]
sorted_median_test_EVEN = sorted(median_test)
sorted_median_test_EVEN
#We have ordered the Data from smallest to largest in both the lists
#sorted_median_test_EVEN and sorted_median_test_ODD
sorted_median_test_ODD
ODD = pd.Series(sorted_median_test_ODD)
ODD.median()
sorted_median_test_EVEN
EVEN = pd.Series(sorted_median_test_EVEN)
EVEN.median()
What it means is , 50% or half of the Data Points are below the MEDIAN and the other half are above.
And so, its also known as the Middle Rank of the Data
Whats nice about the MEDIAN is, it doesn't really care about the ends of the Data. And OUTLIERS don't really bother it. This way MEDIAN is very RESISTANT . Its very STABLE.
Here the MEDIAN is more resistant , its sort of hanging on to the bottom of the Data. But the Right TAIL is pulling the MEAN up and the MODE is the lowest one.
LEFT Skewed Distribution:
- Here the tail is dragging the MEAN down , notice the MEDIAN is more resistant , does not get dragged as much
And the MODE stays at the higher part of the Data where there is more Data
For categorical vaiables , we can look at the MODE
import pandas as pd
#Retrieve the NBA 2018-19 per-game stats table straight from the HTML page
url = 'https://www.basketball-reference.com/leagues/NBA_2019_per_game.html'
html = pd.read_html(url , header = 0) #row 0 is the header
df2019 = html[0] #read_html returns a list of tables; take the first one
html
df2019
#The page repeats the header row inside the body; those rows have the literal string 'Age' in the Age column
df2019[df2019.Age == 'Age']
df2019[df2019.Age == 'Age'].index #these are the indexes from which we have to drop
#Dropping all the headers that are Redundant
#Data Cleaning
raw = df2019.drop(df2019[df2019.Age == 'Age'].index)
raw.head()
#Data Dimension
raw.shape
#Check for missing values
raw.isnull().sum().head(4)
#Filling all the missing values with 0
df = raw.fillna(0)
#Write the cleaned table to a CSV File (no index column)
df.to_csv('nba2019.csv' , index = False)
#Type ls to check if the File has been created
%ls
#We read the Data back in
df = pd.read_csv('nba2019.csv')
df.shape
df.shape[0]
#We want no breaks in between , we want to see all of the rows
pd.set_option('display.max_rows' , df.shape[0] + 1)
df
#Reverting back to the default
pd.set_option('display.max_rows' , 10)
df
#Overview of Datatypes of each column in the Dataframe
df.dtypes.head(5)
#Show specific Data types in our DataFrame
df.select_dtypes(include = ['number'])
#Show specific Data types in our DataFrame
df.select_dtypes(include = ['object'])
#df.select_dtypes(include=['datetime64'])
df.PTS.max()
#Filter : df ---> PTS == max()
#This is Conditional Selection
df[df.PTS == df.PTS.max()]
PlayerMaxPoints = df[df.PTS == df.PTS.max()]
PlayerMaxPoints.Tm
PlayerMaxPoints
PlayerMaxPoints.Pos
PlayerMaxPoints.G
df[df['PTS']>20]
df[df['3P'] == df['3P'].max()]
df[df['AST'] == df['AST'].max()]
#Two equivalent ways to pull out a single team's rows:
#1) a boolean mask on the 'Tm' column
LAL1 = df[df['Tm'] == 'LAL']
LAL1.shape
#2) group by team >>> df.groupby('Tm') , then select the group we want: LAL
# BUG FIX: the original inspected LAL.shape before LAL was assigned (NameError);
# define LAL first, then look at its shape.
LAL = df.groupby('Tm').get_group('LAL')
LAL.shape
LAL
df.head()
OKC = df.groupby('Tm').get_group('OKC')
OKC.head()
#We first group by the Position
df.groupby('Pos').PTS.describe()
Observation
#First we will define a Variable called "Position"
#We will make a List of positions that we want to be shown
position = ['C' , 'PF' , 'SF' , 'PG' , 'SG']
POS = df[df['Pos'].isin(position)]
POS.head(5)
POS.Pos.unique()
#Now lets look at the Descriptive Statistics
#We are viewing Descritive statistics of only one column
POS.groupby('Pos').PTS.describe()
We will also try to answer this Question by showing some Histogram Plots. So, to make it a bit easier lets create a Subset DataFrame.
#Here we are going to select the columns 'Pos' and 'PTS' ---> PTS= df[['Pos', 'PTS']]
#Then we have a list of only the Five positions we are interested in
#position = ['C' , 'PF' , 'SF' , 'PG' , 'SG']
PTS = df[['Pos' , 'PTS']]
position = ['C' , 'PF' , 'SF' , 'PG' , 'SG']
PTS = PTS[PTS['Pos'].isin(position)]
PTS
PTS.head(2)
PTS['PTS'].hist(by = PTS['Pos'])
Observation
#Lets change the Layout
#This will show you one Row and Five columns
#We customize the fig size as well
PTS['PTS'].hist(by = PTS['Pos'] , layout =(1,5), figsize=(16,2))
#You can further go ahead and Customize the Bins
#We can do this using Seaborn
import seaborn as sns
import matplotlib.pyplot as plt
g = sns.FacetGrid(PTS , col = 'Pos')
g.map(plt.hist, "PTS");
PTS['PTS'].describe()
Here are the Scores of 15 Students
#Scores of 15 students, turned into a pandas Series for describe()/hist()
scores = [88 , 48, 60, 51, 57 , 85 , 69, 75, 97 , 72, 71 , 79, 65, 63 , 73]
len(scores)
scores = pd.Series(scores)
scores
sorted(scores)
scores.describe()
scores.median()
sorted(scores)
scores.hist()
scores = [88 , 48, 60, 51, 57 , 85 , 69, 75, 97 , 72, 71 , 79, 65, 63 , 73]
#List repetition doubles every frequency but leaves the histogram's shape unchanged
scores = scores * 2
scores = pd.Series(scores)
scores.hist()
drinks = pd.read_csv('drinks.csv')
drinks.head(2)
import pandas as pd
import numpy as np
alcohol = pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'], index_col = 'country')
alcohol.mean()
alcohol.median()
alcohol.hist()
len(alcohol)
alcohol.describe()
The mean in math and statistics summarizes an entire dataset with a single number representing the data’s center point or typical value. It is also known as the arithmetic mean, and it is the most common measure of central tendency. It is frequently called the “average.”
alcohol.mean()
alcohol.median()
alcohol.hist()
Observation
X-Axis : Will have values increasing from left to right
Y-Axis : will have frequencies or how many times those values show up in the distribution
scores = [1,1,1,2,2,2,2,4,4,4,4,4,5,5,5,5,5,5,6,7,9,10,11,12,12,12,13,14,15,
17,19,20,21,23,25,33,35,40,44,45,46,47,48]
scores = pd.Series(scores)
scores.mean()
scores.median()
scores.hist()
scores = [0,0.8,1,1.2,1.5,2,2.2,2.5,3,3.2,3.5,3.6,3.7,4,4.1,4.2,4.4,4.5,5,6]
scores = pd.Series(scores)
scores.mean()
scores.median()
scores.hist()
alcohol = pd.read_csv('drinks.csv' , usecols = ['country' , 'wine_servings'], index_col = 'country')
alcohol.quantile(0.5)
alcohol.hist()
Observation
Remember : As we said MEDIAN is the 50th Quantile
IQR is the difference between the First and Third Quartiles
alcohol.median()
#IQR is calculated as:
iqr = alcohol.quantile(0.75) - alcohol.quantile(0.25)
iqr
Observation
The InterQuartile Range measures the spread of the middle half of your data. It is the range for the middle 50% of your Sample. Use the IQR to assess the variability where most of your values lie.
Larger values indicate that the central portion of your data spread out further. conversely smaller values show that , the middle values cluster more tightly. It is also helpful to test whether your data is normally distributed.
InterQuartile Range is one of the several measures of variability:
To visualize the interquartile range, imagine dividing your data into quarters. Statisticians refer to these quarters as quartiles and label them from low to high as Q1, Q2, Q3, and Q4. The lowest quartile (Q1) covers the smallest quarter of values in your dataset. The upper quartile (Q4) comprises the highest quarter of values. The interquartile range is the middle half of the data that lies between the upper and lower quartiles. In other words, the interquartile range includes the 50% of data points that are above Q1 and below Q4.
IQR is less affected by Outliers. typically , use the IQR with a measure of Central Tendency , such as the MEDIAN to understand your Data's Center and Spread. This combination creates a fuller picture of your data's distribution.
Unlike the more familiar MEAN and Standard deviation, the IQR and Median are Robust measures. They are not influenced by the Outliers because they dont depend on every value. Additionally , like the MEDIAN , IQR is superb for skewed Distributions. For Normal Distributions , you can use the standard Deviation to determine the percentage of obsrervations that fall specific distances from the MEAN. For Skewed distributions , IQR is an excellent alternative.
IQR takes the Third Quartile Value and subtracts the First Quartile value. Equivalently , the InterQuartile Range is the region between the 75th and 25th Percentile (75 - 25 = 50% of the Data)
Using the IQR formula we need to find the values for Q3 and Q1. To do this, order your data from Low to high and split it into four Equal Proportions.
Q3 = alcohol.quantile(0.75)
Q1 = alcohol.quantile(0.25)
print(Q3)
print(Q1)
iqr = alcohol.quantile(0.75) - alcohol.quantile(0.25)
iqr
The Box in the Box plot is your InterQuartile Range. It contains 50% of your Data. By comparing the size of these boxes , you can understand your data's variability. More dispersed distributions have wider boxes.
Additionally , Find where the MEDIAN line falls within each InterQuartile box.If the MEDIAN is close to one side or the other of the Box , its a skewed distribution. When the MEDIAN is near the centre of the Interquartile range , your distribution is symmetric.
You can also find Outliers using IQR . Since IQR is robust and not influenced by Outliers:
alcohol.min()
alcohol.max()
alcohol.std()
Standard Deviation is just the Square Root of the Variance
alcohol.var()
alcohol.std()**2 == alcohol.var()
76.134917 ** 2
Almost everything that we previously calculated manually , could be obtained using this single method.
alcohol.describe()
This method returns a Pandas Series containing Descriptive Statistics and it is very convenient and Quick.
#We can specify we also need the 79th and the 19th Percentiles as well
alcohol.describe(percentiles = [0.79 , 0.19])
Observation
median is the data value in the data, above and below which there is an equal number of data points.there is an equal probability of falling above or below it.
Mean, median, and mode are three kinds of "averages". There are many "averages" in statistics, but these are, I think, the three most common, and are certainly the three you are most likely to encounter in your pre-statistics courses, if the topic comes up at all.
The "mean" is the "average" you're used to, where you add up all the numbers and then divide by the number of numbers. The "median" is the "middle" value in the list of numbers. To find the median, your numbers have to be listed in numerical order from smallest to largest, so you may have to rewrite your list before you can find the median. The "mode" is the value that occurs most often. If no number in the list is repeated, then there is no mode for the list.
Mode- It is the value which has highest frequency in population.
The median, just like the mean, is a single-number indicator of how a particular distribution is centered. When the distribution is symmetrical, the two numbers coincide. But when the distribution is skew (e.g. in income statistics) the median is a better indicator (gives a more realistic picture at a glance).
Median is the most robust estimator to outliers. Outlier is one huge problem in statistics since it happen all the time. When people record the data wrongly, when there are strange cases in the world (income). So median is good idea to look when that happens.
They are simple descriptive statistics - so 3 simple numbers tell you a fair amount about your data and you can use them in your reporting.
But don’t underestimate the value of plotting/graphing your data. This will give you pretty much the same information as those 3 statistics, and even more detail, e.g. it will tell you the range, give a visual indication of the quartiles and the presence/absence of outliers.
alcohol.describe()
#We can use the include and exclude parameter
#Filter data by type , include float data type and exclude object data type
alcohol.describe(include =float , exclude = object)
since "alcohol" dtype is just "float" , the above output did not change when we specified the include and exclude explicitly
type(alcohol)
mode() gives us the most common item or the value that occurs most frequently in a collection of values
So, the value that appears most number of times or the one that has the highest frequency in the series becomes the mode. Mode is the PEAK of the distribution.
Median is the middle most observation. In other words, its the point in our dataset that separates the dataset into two equal sized parts. And the MEAN is simply the Average value , which we calculate as the sum of all observations divided by the count of all observations.
#Calculate the mode
alcohol.mode()
Observation
#Lets say we are interested in figuring out how many times this value of 1 occurs.
#To answer this we use a basic Boolean Mask
#we use "==" to compare the integer value "1" to each value in the series
alcohol == 1
#Now we can use this boolean mask to index the alcohol series
alcohol[alcohol['wine_servings'] == 1]
alcohol[alcohol['wine_servings'] == 1].size
Observation
Calculates the number of occurrences for each unique value all in one go
alcohol['wine_servings'].value_counts()
Observation
#The first value is oocuring how many times
alcohol['wine_servings'].value_counts().iloc[0]
alcohol['wine_servings'].value_counts().values
len(alcohol['wine_servings'].value_counts())
alcohol['wine_servings'].value_counts().tail()
alcohol['wine_servings'].value_counts()
vc = pd.DataFrame(alcohol['wine_servings'].value_counts())
vc.index
#value_counts() sorts by descending frequency, so the first index label is the MODE
vc.index[0]
len(vc.index)
# FIX: the original hard-coded vc.index[70], which only happens to be the last
# position for this particular dataset; index with -1 to get the last label generally.
vc.index[-1]
vc.index[len(vc.index)-1]
By default "normalize" is set to "False", but when we set to True , we get the relative frequencies of the unique values in the series .
alcohol['wine_servings'].value_counts(normalize = True)
Observation
alcohol.max()
But which country this is ?
#We can set a Boolean Mask
alcohol[alcohol['wine_servings'] == alcohol['wine_servings'].max()]
alcohol['wine_servings'].hist()
#But what we get here is a Series and not the country name separately
alcohol[alcohol['wine_servings'] == alcohol['wine_servings'].max()]
type(alcohol[alcohol['wine_servings'] == alcohol['wine_servings'].max()])
#we can get the country name by isolating the "Index" component of this series
alcohol[alcohol['wine_servings'] == alcohol['wine_servings'].max()]
drinks.head(2)
#Rebuild alcohol as a Series of wine_servings values labelled by country
alcohol = pd.Series(drinks['wine_servings'].values, index=drinks['country'])
alcohol
#we can get the country name by isolating the "Index" component of this series
#This returns a pandas Index object
alcohol[alcohol == alcohol.max()].index
#To get the country name, take the first (and here only) label
alcohol[alcohol == alcohol.max()].index[0]
The above procedure is very confusing. The same thing can be done using the idx method.
alcohol.idxmax()
#This is the Series
alcohol.max()
#This is the Dataframe
drinks['wine_servings'].max()
alcohol[alcohol == alcohol.max()]
drinks['wine_servings'].idxmax()
drinks['wine_servings'].iloc[136]
drinks.iloc[drinks['wine_servings'].idxmax()]
drinks.loc[drinks['wine_servings'] == drinks['wine_servings'].max()]
alcohol.idxmin()
alcohol.min()
we have seen in the value_counts , there are 28 countries with a min value of 1
alcohol.value_counts().head(1)
But what we see in "alcohol.idxmin" is only one of those countries
alcohol.idxmin()
#This is the actual result
alcohol[alcohol == alcohol.min()]
idxmin() and idxmax() are very convenient methods that will return only the First label associated with the min and max values respectively
alcohol.idxmax()
#we can simply enter this index value to get the row
alcohol['Portugal']
drinks.head(2)
drinks.iloc[0]
⚫⚫🛑➖⚫🛑⚫⚫➖⚫⚫🛑➖⚫🛑⚫⚫⚫⚫🛑➖⚫🛑⚫⚫⚫⚫🛑➖⚫🛑⚫⚫⚫⚫🛑➖⚫🛑⚫⚫⚫⚫🛑➖⚫🛑⚫⚫